IRIX 6.5 Applications 2001 May

home *** CD-ROM | disk | FTP | other *** search

/ IRIX 6.5 Applications 2001 May / SGI IRIX 6.5 Applications 2001 May.iso / dev / insight_dev.idb / usr / share / Insight / bin / indexgen_sgidocbk.z / indexgen_sgidocbk

Wrap

Text File | 2001-04-05 | 22.6 KB | 804 lines

#!/usr/bin/perl5
####################################################################
#
# Name: indexgen_sgidocbk
#
# Note: PERL 5.004 or greater is required for this script.
#
# Function: scan an SGML file to find all instances of <indexterm>
#           tags and use the information contained in those tags to
#           create a separate file that contains an SGML index for the
#           SGIDOCBK DTD that begins with the <index> tag and ends with
#           the </index> tag.
#
# Author: Adrian Daley
#
# Other Information:
#   When STDIN is used for input, <indexterm> tags without id attributes
#   are ignored and not used in the index.  When a file is used for input
#   the id attribute will be fixed on all <indexterm> tags that are
#   lacking them if the '-q' command line argument is not used.
#
# Version 0.5 - 9/28/98
#   Initial version with most functionality and not much support for
#   "see" and "seealso" parts of index terms.
#
# Version 1.0 - 10/8/98
#   Added support for "see" and "seealso" tags in <indexterm> and
#   <indexentry> tags.  Refer to the ProcessTerm, CombineIdenticalTerms,
#   and PrintIndex functions for more information on how they are
#   handled.
#
#   Changed the <index> structure to use <indexdiv> structures with
#   <title> tags to separate parts of the index that start with different
#   letters.
#
#   Added -q command line option to bypass ID attribute checks
#
#   Added additional comments and documentation
#
# Version 1.1 - 10/12/98
#   Fixed bug related to <comment></comment> handling
#
#   Now removes the following characters from sort as values to
#   ensure the proper sort order: '$', '/', '.', '<', '"', '-'
#
# Version 1.2 - 3/1/99
#   Changed CleanInput function to move any <indexterm> tags that
#   occur within a <title> or <tbltitle> to just before the title.
#   This is needed for the Inso stylesheet content() PVF.
#
#   Removed the use of <indexdiv> tags in the output to
#   reduce the dependencies on specific languages
#
#   Added some hints as to how to localize the program
#
# Version 1.3 - 02Mar99 (Ferg / gferg@sgi.com)
#   Localization work; locale map and setlocale() implemented
#
####################################################################

# localization; see perllocale(1) for details
#
# perl 5.004 can be installed from freeware.sgi.com or internally
# at : hoshi.engr:/usr/local/dist/perl5/5.004/
#
require 5.004;

# use the locale for the life of the program
#
use locale;
use POSIX qw(locale_h);
use POSIX qw(strcoll);

# NOTE: the subs further down in this file are compiled inside the scope of
# these file-level lexicals.  Do not add a file-level "my" whose name one of
# those subs passes to local() -- that is a compile-time error in Perl.
my($input_file, $output_file);
my($verbose) = 1;
my(@terms, @sorted_terms);
my($total, $unique_entries);
my($lang) = '';
my($locale) = '';

# Mapping from SGIDOCBK LANG attrib to supported IRIX system locales
#
# Note : We may want to expand the left-hand side to include additional
#        variations
#
# See : http://localize.engr/root/products/IRIXLocales/locales_6_5.html
#
my(%locales) = (
    'C'             => 'C',           # English
    'en'            => 'C',           # English
    'de'            => 'de',          # German
    'fr'            => 'fr',          # French
    'es'            => 'es',          # Spanish
    'jp'            => 'ja_JP.EUC',   # Japanese
    'ja_JP'         => 'ja_JP.EUC',   # Japanese
    'ja_JP.EUC'     => 'ja_JP.EUC',   # Japanese
    'ja_JP.ujis'    => 'ja_JP.EUC',   # Japanese
    'ja_JP.eucJP'   => 'ja_JP.EUC',   # Japanese
    'ja_JP.SJIS'    => 'ja_JP.SJIS',  # Japanese (shift-JIS)
    'ja.SJIS'       => 'ja_JP.SJIS',  # Japanese (shift-JIS)
    'zh_TW'         => 'zh_TW.ucns',  # Traditional Chinese
    'zh_TW.ucns'    => 'zh_TW.ucns',  # Traditional Chinese
    'zh_TW.EUC'     => 'zh_TW.ucns',  # Traditional Chinese
    'zh_TW.big5'    => 'zh_TW.big5',  # Traditional Chinese
    'zh_CN'         => 'zh_CN.ugb',   # Simplified Chinese
    'zh_CN.ugb'     => 'zh_CN.ugb',   # Simplified Chinese
    'zh_CN.EUC'     => 'zh_CN.ugb',   # Simplified Chinese
    'zh_CN.gbk'     => 'zh_CN.gbk',   # Simplified Chinese
    'zh_CN.eucgbk'  => 'zh_CN.gbk',   # Simplified Chinese
    'ko'            => 'ko_KR.euc',   # Korean
    'ko_KR'         => 'ko_KR.euc',   # Korean
    'ko_KR.euc'     => 'ko_KR.euc',   # Korean
    'ko_KR.eucKR'   => 'ko_KR.euc'    # Korean
);

# Global delimiter value for strings
# This value should be set such that it never will occur in a string.
# It is a package global because every sub in this file reads it.
$delimiter = ':%:%:';

($input_file, $output_file, $verbose) = ProcessArgs();

# Read the input file into a single string replacing newlines with spaces.
# $buffer is deliberately left a package global.
if($verbose) { print STDERR "\tReading file...\n"; }
$buffer = '';
while(defined(my $line = <$input_file>)) {
    chomp $line;
    $buffer .= $line.' ';
}

# remove all <comment></comment> sections in case they contain <indexterm> tags
$buffer =~ s#<comment[^>]*>.*?</comment>##img;

# find the locale for this document from the <sgidocbk> LANG attribute.
#
if($buffer =~ /<sgidocbk[^>]*LANG\s*=\s*"([^"]+)"/im) {
    $lang = $1;
}
else {
    $lang = "C";
}

# cannot find the correct locale for the specified book LANG
#
$locale = $locales{$lang};
if(!defined($locale) || $locale eq '') {
    print STDERR "\tWARNING: Locale for LANG attribute '$lang' was ",
                 "not found; 'C' will be used.\n";
    $locale = 'C';
}

if($verbose) { print STDERR "\tUsing LANG='$lang' with locale='$locale'\n"; }

# set the proper locale
#
# LC_CTYPE needed for uc(), lc(), ucfirst(), lcfirst()
# LC_COLLATE needed for lt, le, cmp, ge, gt, strcoll(), sort()
#
if( !setlocale(LC_CTYPE, $locale) ) {
    print STDERR "\tWARNING: setlocale() for locale '$locale' failed; ",
                 "'C' will be used.\n";
    setlocale(LC_CTYPE, 'C');
    setlocale(LC_COLLATE, 'C');
}
else {
    setlocale(LC_COLLATE, $locale);
}

# iteratively find all of the <indexterm>s and create an index entry for them.
# The /g match resumes right after the previous match, which visits the same
# terms as re-assigning the buffer to $' did, without recopying the remainder
# of the document for every term.
if($verbose) { print STDERR "\tParsing <indexterms>...\n"; }
while($buffer =~ m#<indexterm([^>]*)>(.+?)</indexterm>#gis) {
    my $result = ProcessTerm($1, $2);
    if(defined($result)) {
        push(@terms, $result);
    }
}

$total = scalar(@terms);
if($total == 0) {
    print STDERR "\tNo index terms found. No index will be created.\n";
    exit(0);
}

if($verbose) { print STDERR "\tSorting $total terms...\n"; }
@sorted_terms = sort by_alpha @terms;

if($verbose) { print STDERR "\tCombining duplicate terms...\n"; }
$unique_entries = CombineIdenticalTerms(\@sorted_terms);

if($verbose) { print STDERR "\tPrinting $unique_entries unique terms...\n"; }
PrintIndex(\@sorted_terms, $output_file);

# buffered write errors (disk full, ...) only surface when the handle is closed
close($output_file) || die "Unable to finish writing the index: $!\n";

print STDERR "\tFinished: $total terms found and indexed in $unique_entries unique entries.\n";

### END MAIN PROGRAM ###

###############################################################################
#
# Read in the command line arguments.  Open the input and output
# file handles and return references to them.  If the input source
# is a file, then call CleanInput to check and fix the ID attribute
# on the file.  If the input is STDIN and/or the output is STDOUT,
# references to the respective filehandle will be returned.  The "-s"
# argument determines if verbose status reports are not generated.
# If the -q argument is used, checking and fixing attribute tags in the
# input file will be bypassed for faster processing if the user knows
# it is not needed.
#
# All diagnostics are written to STDERR because STDOUT is the default
# destination of the generated index.
#
# Returns: (filehandle input, filehandle output, boolean verbose_status)
#
###############################################################################
sub ProcessArgs {
    # $input stays a dynamically scoped local() so that subs called from
    # here can still see the input file name through the package variable.
    local($input) = "";
    my($output) = "";
    my($verbose) = 1;
    my($do_cleanup) = 1;

    # defined() keeps an argument of "0" from silently ending the parse
    while(defined(my $arg = shift(@ARGV))) {
        if($arg =~ /^-h/) {
            Usage();
        }
        elsif($arg =~ /^-q/) {
            $do_cleanup = 0;
        }
        elsif($arg =~ /^-s/) {
            $verbose = 0;
        }
        elsif($arg =~ /^-i/) {
            $input = shift(@ARGV);
            if(!defined($input) || $input eq "") {
                Usage();
            }
            elsif(! -e $input) {
                print STDERR "\nInput file doesn't exist!\n";
                exit(1);
            }
        }
        elsif($arg =~ /^-o/) {
            $output = shift(@ARGV);
            if(!defined($output) || $output eq "") {
                print STDERR "\nOutput file name not specified correctly";
                Usage();
            }
        }
        else {
            print STDERR "\nThe argument '$arg' is not supported.";
            Usage();
        }
    }

    # open the input and output files
    if($input ne '') {
        if($do_cleanup) {
            # first clean up the file to make sure every <indexterm> has an
            # id attribute
            if($verbose) { print STDERR "\tCorrecting <indexterm> id attributes...\n"; }
            CleanInput($input);
        }

        # The explicit "< " keeps a file name that starts with '>' or ends
        # with '|' from being treated as an open mode (3-arg open would need
        # a newer perl than the 5.004 this script targets).
        open(INPUT, "< $input") || die "Unable to open input file: $input: $!\n";
        $input = \*INPUT;
    }
    else {
        if($verbose) { print STDERR "\tReading from STDIN\n"; }
        $input = \*STDIN;
    }

    if($output ne '') {
        open(OUTPUT, "> $output") || die "Unable to open output file: $output: $!\n";
        $output = \*OUTPUT;
    }
    else {
        if($verbose) { print STDERR "\tWriting to STDOUT\n"; }
        $output = \*STDOUT;
    }

    return($input, $output, $verbose);
}

#################################################################################
#
# Parses an <indexterm> content to gather the <primary>,<secondary>,<tertiary>
# terms, primary, secondary, tertiary sort as attribute values, a
# single <see> tag content and the content of multiple <seealso> tags.
#
# After parsing the input for the above values, all of the values are cleaned
# up to remove extra spaces.  For terms with no sort as values, the respective
# term is used with all SGML tags removed.
#
# Input:
#    $term - the attribute text of the opening indexterm tag
#            (ex. ' ID="???"' from '<indexterm ID="???">')
#    $content - the entire string between the <indexterm> and </indexterm>
#
# Return Value: a reference to an array with the following information.  If
#    a valid indexterm is not found, the value undef is returned.
#
#      0 - the indexterm's ID attribute value - if a <see> value was
#          found, this value will be ''.
#      1 - <primary> tag content
#      2 - <secondary> tag content
#      3 - <tertiary> tag content
#      4 - primary sort as value
#      5 - secondary sort as value
#      6 - tertiary sort as value
#      7 - <see> tag content
#      8 - <seealso> tag(s) content.
#          Multiple values are joined with '$delimiter'
#
#################################################################################
sub ProcessTerm {
    # $term    - attribute text of the opening <indexterm> tag
    # $content - everything between <indexterm> and </indexterm>
    # Returns a reference to the 9-slot record described above, or undef.
    my($term, $content) = @_;
    my($id, $primary, $secondary, $tertiary) = ('', '', '', '');
    my($p_sort, $s_sort, $t_sort) = ('', '', '');
    my($see, $seealso) = ('', '');
    my($attrs);

    if($term =~ m#id="([^"]+)#i) {
        $id = $1;
    }
    else {
        print STDERR "Warning: ID attribute not found in $content. Skipping...\n";
        return(undef);
    }

    if($content =~ m#<primary([^>]*)>(.+?)</primary>#i) {
        $primary = $2;
        $attrs = $1;
        if($attrs =~ m#sortas="([^"]+)#i) { $p_sort = $1; }

        # a secondary is only looked for under a primary, and a tertiary
        # only under a secondary
        if($content =~ m#<secondary([^>]*)>(.+?)</secondary>#i) {
            $secondary = $2;
            $attrs = $1;
            if($attrs =~ m#sortas="([^"]+)#i) { $s_sort = $1; }

            if($content =~ m#<tertiary([^>]*)>(.+?)</tertiary>#i) {
                $tertiary = $2;
                $attrs = $1;
                if($attrs =~ m#sortas="([^"]+)#i) { $t_sort = $1; }
            }
        }

        # search for <see> and <seealso> tags.  The tag name must be followed
        # by whitespace or '>' so that "<see" cannot match the start of a
        # <seealso> tag and swallow it into the "see" value.
        if($content =~ m#<see(?:\s[^>]*)?>(.+?)</see>#i) {
            $see = $1;
        }

        while($content =~ s#<seealso[^>]*>(.+?)</seealso>##i) {
            if($seealso eq '') {
                $seealso = $1;
            }
            else {
                $seealso .= "$delimiter$1";
            }
        }
    }
    else {
        print STDERR "Warning: Invalid primary indexterm in $id. Skipping...\n";
        return(undef);
    }

    if($primary eq '' || $id eq '') {
        return(undef);
    }

    # terms without an explicit sortas attribute sort on their own text
    if($p_sort eq '') { $p_sort = $primary; }
    if($s_sort eq '') { $s_sort = $secondary; }
    if($t_sort eq '') { $t_sort = $tertiary; }

    my @new_term = ($id, $primary, $secondary, $tertiary,
                    $p_sort, $s_sort, $t_sort, $see, $seealso);

    # remove extra whitespace before and after the entries
    for my $i (1..8) {
        $new_term[$i] =~ s/^\s+|\s+$//g;
    }

    for my $i (4..6) {
        # remove extraneous tags from the sortas terms
        $new_term[$i] =~ s/<[^>]*>/ /g;
        $new_term[$i] =~ s/\s+/ /g;

        # remove extra characters that may affect sorting
        $new_term[$i] =~ s/[\/\.\$<"-]//g;

        # Double check to remove extra whitespace
        $new_term[$i] =~ s/^\s+|\s+$//g;

        # convert all sortas terms to lower case.
        $new_term[$i] = lc($new_term[$i]);
    }

    # remove the ID attribute field for terms with a "see" value
    if($new_term[7] ne '') {
        $new_term[0] = '';
    }

    if($new_term[4] eq '') {
        # skip terms that only have SGML tags as their content
        return(undef);
    }

    # return a reference to the array of indexterm information
    return([@new_term]);
}

####################################################################
#
# Sorting routine to determine the ordering of an array of
# index terms.  $a and $b sort items are always references to
# the array described in the ProcessTerm function.  All terms
# are sorted using only their sort as value.
#
####################################################################
sub by_alpha {
    # compare the primary, secondary, and tertiary sort as values (slots
    # 4..6) in turn and return as soon as the two terms differ at some level.
    for my $slot (4..6) {
        my $result = strcoll($a->[$slot], $b->[$slot]);
        if($result != 0) {
            return($result);
        }
    }

    # last ditch sorting on otherwise equivalent terms (compares the
    # stringified references) to make sure the sort order is well defined.
    return(strcoll($a, $b));
}

####################################################################
#
# CombineIdenticalTerms looks for duplicate index entries
# (the same index term at more than one location in the book)
# and combines it into a single entry that will appear in the
# index with multiple references.  This function assumes the
# array of terms has already been sorted so that identical terms
# occur consecutively.
#
# Two terms are considered identical if their primary, secondary,
# and tertiary sortas terms are identical.  If the sortas term
# is not explicitly specified, it is the term with all SGML markup
# removed.
#
# Terms are combined by joining their ID attribute values with
# $delimiter in the 0 array location.  Then the "see" and "see also"
# attributes are combined.  Finally, the extra occurrence of the
# term is removed from the array by setting its value to undef.
#
# Note: Combined terms with different SGML markup in the index term
#       will use the SGML tags for the first occurrence of the tag.
#
# Input:   reference to the sorted array of term records (modified in place)
# Returns: the number of entries left after combining
#
####################################################################
sub CombineIdenticalTerms {
    my($terms) = @_;

    # size the loop from the array that was passed in, not from whatever
    # array named @terms happens to be visible from this sub
    my $last = $#{$terms};
    my $count = $last + 1;
    my $prev_term = $terms->[0];

    for my $i (1 .. $last) {
        my $term = $terms->[$i];

        if($prev_term->[4] eq $term->[4] &&
           $prev_term->[5] eq $term->[5] &&
           $prev_term->[6] eq $term->[6]) {

            # combine the "see" (7), "see also" (8) and ID (0) values
            for my $slot (7, 8, 0) {
                next if($term->[$slot] eq '');
                if($prev_term->[$slot] ne '') {
                    $prev_term->[$slot] .= "$delimiter$term->[$slot]";
                }
                else {
                    $prev_term->[$slot] = $term->[$slot];
                }
            }

            # remove the duplicate term
            $terms->[$i] = undef;

            # decrease the count of unique entries
            --$count;
        }
        else {
            $prev_term = $term;
        }
    }

    return($count);
}

##########################################################################
#
# Prints the SGIDOCBK SGML index.  The output file contains a valid
# index starting with an <index> tag and closing with a </index> tag
#
# The overall format of the index is:
#    <INDEX>
#       <INDEXENTRY></INDEXENTRY>   (one entry for each term)
#    </INDEX>
#
# where index entries are formatted as:
#    <INDEXENTRY>
#       <PRIMARYIE></PRIMARYIE>           - 1 and only 1
#          <SEEIE></SEEIE>                - 0 or more
#          <SEEALSOIE></SEEALSOIE>        - 0 or more
#          <SECONDARYIE></SECONDARYIE>    - 0 or more
#             <SEEIE></SEEIE>             - 0 or more
#             <SEEALSOIE></SEEALSOIE>     - 0 or more
#             <TERTIARYIE></TERTIARYIE>   - 0 or more after each <SECONDARYIE>
#                <SEEIE></SEEIE>          - 0 or more
#                <SEEALSOIE></SEEALSOIE>  - 0 or more
#    </INDEXENTRY>
#
# Note: <XREF> tags are inserted as needed in the <PRIMARYIE>,
#       <SECONDARYIE>, and <TERTIARYIE> tags to create links to
#       the appropriate location in the book.
#
##########################################################################
sub PrintIndex {
    # $terms  - reference to the sorted array of term records built by
    #           ProcessTerm; undef slots (combined duplicates) are skipped
    # $output - filehandle the index is written to
    my($terms, $output) = @_;
    my @levels = ('IGNORED', 'PRIMARYIE', 'SECONDARYIE', 'TERTIARYIE');

    # sort key of the primary/secondary/tertiary that is currently "open"
    # in the output, indexed by level (1..3)
    my @current_open_term;
    my $entry_open = 0;

    print $output "<INDEX>\n\n";

    # iterate over the array that was passed in, not over whatever array
    # named @terms happens to be visible from this sub
    for my $i (0 .. $#{$terms}) {
        my $term = $terms->[$i];

        # skip invalid, undefined entries
        next if(! defined($term));

        # determine which term level should get the <xrefs> added:
        # the deepest level that has content
        my $count = 0;
        for my $j (1..3) {
            if($term->[$j] ne '') { ++$count; }
        }

        my $term_level = 1;
        while($term_level <= 3 && $term->[$term_level] ne '') {
            my $sort_key = $term->[$term_level+3];

            # this level is already open from a previous term; descend
            if(defined($current_open_term[$term_level]) &&
               $current_open_term[$term_level] eq $sort_key) {
                ++$term_level;
                next;
            }

            if($term_level == 1) {
                # a new primary closes the previous entry, if there was one
                if($entry_open) {
                    print $output "</INDEXENTRY>\n\n";
                }
                print $output "<INDEXENTRY>\n";
                $entry_open = 1;
            }

            # indent tags to improve readability
            print $output ' ' x ($term_level*3);
            print $output "<$levels[$term_level]>$term->[$term_level]";

            # remember what is open at this level and forget every deeper
            # level; otherwise an equal tertiary under a different secondary
            # would be mistaken for one that is already open and dropped
            $current_open_term[$term_level] = $sort_key;
            $#current_open_term = $term_level;

            # if we're at the proper level add the <xrefs>
            # $term->[0], the ID value may have multiple entries
            # separated by "$delimiter"
            if($count == $term_level && $term->[0] ne '') {
                foreach my $ref (split(/\Q$delimiter\E/, $term->[0])) {
                    next if($ref eq '');
                    print $output " <XREF LINKEND=\"$ref\">";
                }
            }
            print $output "</$levels[$term_level]>\n";

            # print out any <SEEIE> and <SEEALSOIE> tags
            if($count == $term_level) {
                my $indent = ' ' x (($term_level+1)*3);
                if($term->[7] ne '') {
                    $term->[7] = RemoveDuplicates($term->[7]);
                    _PrintSeeEntries($output, 'SEEIE', $term->[7], $indent);
                }
                if($term->[8] ne '') {
                    $term->[8] = RemoveDuplicates($term->[8]);
                    _PrintSeeEntries($output, 'SEEALSOIE', $term->[8], $indent);
                }
            }
            ++$term_level;
        }
    }

    # close the last <indexentry> tag if any existed
    if($entry_open) {
        print $output "</INDEXENTRY>\n\n";
    }

    print $output "</INDEX>\n";
}

# Private helper for PrintIndex: writes one <$tag>...</$tag> line, prefixed
# with $indent, for every non-empty "$delimiter"-separated value in $values.
sub _PrintSeeEntries {
    my($output, $tag, $values, $indent) = @_;

    foreach my $ref (split(/\Q$delimiter\E/, $values)) {
        next if($ref eq '');
        print $output $indent, "<$tag>$ref</$tag>\n";
    }
}

########################################################################
#
# Given a string of multiple values separated by "$delimiter" remove any
# duplicate values and return a string of unique values separated by
# "$delimiter" and sorted in case-insensitive order (values that differ
# only in case are ordered by a case-sensitive comparison so the result
# does not depend on hash ordering).  Any SGML tags that may exist in the
# values are not used to determine identical values; when two values are
# identical the one that occurs last is kept.
#
########################################################################
sub RemoveDuplicates {
    my($string) = @_;
    my %hash;

    foreach my $part (split(/\Q$delimiter\E/, $string)) {
        my $key = $part;

        # remove SGML tags and extra spaces from the key values
        # used to determine if two values are identical.
        $key =~ s/<[^>]*>/ /g;
        $key =~ s/\s+/ /g;
        $key =~ s/^\s+|\s+$//g;
        $hash{$key} = $part;
    }

    # LOCALIZE: the returned values should be sorted according to the best locale
    return(join("$delimiter",
                sort { lc($a) cmp lc($b) || $a cmp $b } (values(%hash))));
}

#########################################################################
#
# Given a file name, the <indexterm> tags in the file will be
# checked for valid ID attributes.  Tags with invalid or missing
# attribute values will have new, unique values provided.
#
# Note: to differentiate generated attributes from existing ones
#       all generated values start with "IG"
#
# Input: file name of the file that needs to be verified.
#
# Returns: nothing
#
# Details:
#   - <indexterm> elements found inside a <title> or <tbltitle> are moved
#     to just before that title; titles without index terms are left
#     exactly as they were.
#   - an <indexterm> whose id attribute is missing, or whose id value
#     contains no ASCII letter, gets a generated "IG" id that is not
#     already used by a quoted id attribute elsewhere in the file.
#   - the file is rewritten in place only when something changed.
#
#########################################################################
sub CleanInput {
    my($file) = @_;
    my($id_count) = 0;   # counter used to build generated ids
    my($changed) = 0;    # set once the file content differs from disk

    # private handle names, so handles the caller has open are not disturbed
    open(CLEAN_IN, "< $file")
        || die "Can't open input file $file for clean up.\n";
    my $text = join('', <CLEAN_IN>);
    close(CLEAN_IN);

    # Pass 1: move index terms out of titles.  \b keeps "<title" from
    # matching longer element names such as <titleabbrev>.
    my $newbuffer = '';
    my $copied_to = 0;
    while($text =~ m#(<(title|tbltitle)\b[^>]*>.*?</\2>)#gis) {
        my $title = $1;
        my $title_end = pos($text);
        my $title_start = $title_end - length($title);

        # copy the text between the previous title and this one unchanged
        $newbuffer .= substr($text, $copied_to, $title_start - $copied_to);
        $copied_to = $title_end;

        my $moved_terms = '';
        while($title =~ s#(<indexterm[^>]*>.*?</indexterm>)##is) {
            $moved_terms .= $1;
        }

        if($moved_terms ne '') {
            # remove extra newlines left over from the <indexterm> tags
            $title =~ s#^\n+|\n+$##imsg;
            $newbuffer .= "$moved_terms\n$title";
            $changed = 1;
        }
        else {
            $newbuffer .= $title;
        }
    }
    $newbuffer .= substr($text, $copied_to);
    $text = undef;

    # remember every quoted id already present (case-insensitively) so that
    # a generated id can never duplicate one
    my %seen_ids;
    while($newbuffer =~ m#\bid\s*=\s*"([^"]*)"#gi) {
        $seen_ids{uc($1)} = 1;
    }

    # Pass 2: split after every </indexterm> so that each chunk holds at
    # most one opening <indexterm> tag, then repair that tag's id.
    my @chunks = split(/(<\/indexterm[^>]*>)/i, $newbuffer);
    $newbuffer = undef;

    foreach my $chunk (@chunks) {
        next unless($chunk =~ /<indexterm([^>]*)>/i);
        my $attrs = $1;

        if($attrs =~ /id="([^"]*)"/i) {
            my $old_id = $1;

            # good id value (contains a letter), no changes needed
            next if($old_id =~ /[A-Za-z]/);

            # correct bad ID value; [^>]* (unlike '.') also spans newlines,
            # so the id may sit on a different line than "<indexterm"
            my $id = _NewIndextermId(\%seen_ids, \$id_count);
            $chunk =~ s/(<indexterm[^>]*?id=")[^"]*/$1$id/i;
        }
        else {
            # no id attribute found - create a new one
            my $id = _NewIndextermId(\%seen_ids, \$id_count);
            $chunk =~ s/(<indexterm[^>]*)/$1 ID="$id"/i;
        }
        $changed = 1;
    }

    if($changed) {
        # need to save changes
        open(CLEAN_OUT, "> $file")
            || die "Can't open output file: $file for clean up\n";
        print CLEAN_OUT join("", @chunks);
        close(CLEAN_OUT)
            || die "Can't finish writing $file during clean up: $!\n";
    }
}

# Private helper for CleanInput: returns the next "IG<pid><counter>" id that
# is not yet recorded in %$seen, records it, and advances $$counter.
sub _NewIndextermId {
    my($seen, $counter) = @_;
    my $id;

    do {
        $id = "IG" . $$ . ${$counter};
        ++${$counter};
    } while($seen->{uc($id)});

    $seen->{uc($id)} = 1;
    return($id);
}

####################################################################
#
# Prints the program's usage statement and exits.
#
# The text goes to STDERR because STDOUT is the default destination of
# the generated index.  The exit status is always 1.
#
####################################################################
sub Usage {
    # strip any leading directory from the program name
    my $name = $0;
    $name =~ s#.*/##g;

    print STDERR <<END_USAGE;

$name Version 1.3

Usage: $name [-h] [-s] [-q] [-o <FileName>] [-i <FileName>]

   -h             Print this help message
   -i <filename>  Read input from specified file rather than STDIN
   -o <filename>  Write output to specified file rather than STDOUT
   -s             Silent mode; don't print update messages to STDERR
   -q             Quick; don't attempt to check/fix <indexterm> ID attributes
                  Invalid <indexterm> tags will be skipped.

END_USAGE

    exit(1);
}

####################################################################
#
# Prints the contents of an index term entry.
# This should only be used for debugging.
#
# Input: a reference to the entry array location
#        (the 9-slot array, indexes 0..8, is printed one slot per line)
#
####################################################################
sub DebugEntry {
    print "\n----------------------------------------------\n";
    my($ref) = @_;

    if(! defined($ref)) {
        print "DebugEntry - undefined reference\n";
        return;
    }

    # LOCALIZE
    # may need to dereference collated array entries
    foreach my $i (0..8) {
        print "$i - $ref->[$i]\n";
    }
    print "-------------------------------------------------\n";
}

####################################################################